# -*- coding: utf-8 -*-
"""
Created on Fri Mar  1 12:59:34 2019

@author: ba8rb2
"""

import pandas as pd
import re
import numpy as np
import time as ti
import os


from matplotlib import pyplot as plt

import gensim as gs
from gensim.models.coherencemodel import CoherenceModel as cm

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words
  
import pyLDAvis.gensim
from nltk.collocations import *
import pickle
from langdetect import detect

import warnings
warnings.filterwarnings('ignore') 
#%%
## Change User Name to load data
user = "ba8rb2\\"

## All project folders live below the same Dropbox directory of that user;
## change `user` above to point the script at another Windows account.
_project_base = "C:\\Users\\" + user + "Dropbox\\Andere Aufgaben\\Artikel\\Schubi\\"

## Folder holding the REF input data
path = _project_base + "RAE-REF\\"

## Root folder of the Python project
root = _project_base + "Python\\"

## Folder all results are written to
output = root + "output\\"


#%%

# Load the "Output" sheet of the REF workbook, remember its column names and
# keep only the rows that actually have an abstract

df = pd.read_excel(path + "Sociology_2014.xlsx", sheet_name="Output")
keys = list(df)
df = df.dropna(subset=["Abstract"])


#%%

def text_cleaner(df, textlist, language, newvar):
    """
    Clean, tokenize and lemmatize a text variable stored in a pandas dataframe.

    Every text has its non-word characters replaced by blanks, is tokenized,
    lower-cased and lemmatized. A token is dropped when it

    * is part of the domain-specific stop list defined below,
    * is an NLTK stop word of ``language``,
    * is one of the name parts found in the row's "Authors" entry
      (compared case-insensitively),
    * has three or fewer characters, or
    * contains a digit.

    Parameters
    ---------------------------------------
    * df : Dataframe. It is modified in place (``newvar`` is added) and
      returned. An "Authors" column with ";"-separated names is optional.

    * textlist : Name of the variable where the text-files are stored

    * language : Language name handed to ``nltk.corpus.stopwords.words``

    * newvar : Name of the newly generated output variable

    Returns
    ---------------------------------------
    The dataframe ``df`` with the column ``newvar`` holding one list of
    tokens per row. Rows whose text is not a string get an empty list.
    """

    begin_time = ti.time()

    stop_words = set(stopwords.words(language))
    lemmatizer = WordNetLemmatizer()

    ## Patterns are compiled once because they are applied to every text/token
    non_word = re.compile(r"\W")
    has_digit = re.compile(r"[0-9]")
    name_part = re.compile(r"\w+")

    ## Specific list with sociological stop words. A set gives constant-time
    ## membership tests. Tokens are compared one by one, so the multi-word
    ## entries are presumably never matched; they are kept for reference.
    booklist_clean = frozenset([
        "this book", "book", "author", "authors",
        "this study", "this article", "articl", "this",
        "aim", "paper", "study", "purpose", "social", "chapter",
        "provide", "provides", "examine", "examines",
        "investigate", "investigates", "relevant", "explores",
        "explore", "sociology", "within", "concern",
        "use", "studi", "volume", "volum", "draw", "research",
        "article", "find", "finds", "understand", "understanding",
        "argue", "argues", "argu", "concept", "conceptualize",
        "analysis", "analyse", "uses", "sociolog", "new", "sociological",
        "discuss", "context", "account", "way",
        "make", "draws", "makes", "contributes", "contribute",
        "see", "e.g", "e.g.", "important", "studies",
        "also", "recent", "approach", "first", "may",
        "using", "sociologists",
    ])

    texts = df[textlist]
    ## The author column is optional; without it no author names are removed
    if "Authors" in df.columns:
        authors_column = df["Authors"]
    else:
        authors_column = [None] * len(texts)

    print("begin cleaning procecss")
    cleaned_texts = []
    for counter, (t, authors) in enumerate(zip(texts, authors_column)):
        ## Missing texts (e.g. NaN) cannot be tokenized and yield no tokens
        if not isinstance(t, str):
            cleaned_texts.append([])
            continue

        print("column:",counter, ". Text begins with:",t[:50],"\n")

        ## Name parts of this row's authors. They are lower-cased and split
        ## on non-word characters, i.e. brought into the same shape as the
        ## text tokens, otherwise they could never match them.
        if isinstance(authors, str):
            author_tokens = set(name_part.findall(authors.lower()))
        else:
            author_tokens = set()

        ## Tokenization and cleaning (keep only words)
        tokens = []
        for raw_token in word_tokenize(non_word.sub(" ", t)):
            raw_token = raw_token.lower()
            lemma = lemmatizer.lemmatize(raw_token)
            if len(lemma) <= 3 or has_digit.search(lemma):
                continue
            if lemma in booklist_clean or lemma in stop_words:
                continue
            ## Check both forms: lemmatizing may have altered a surname
            if raw_token in author_tokens or lemma in author_tokens:
                continue
            tokens.append(lemma)
        cleaned_texts.append(tokens)

    df[newvar] = cleaned_texts

    end_time = ti.time()

    print("time taken for cleaning_process in Minutes:" ,(end_time-begin_time)/60)
    return df


def most_words(df,word_var,word_num):
    """
    Rank all tokens of a "bag of words" variable by how often they occur.

    The token lists of all rows are pooled, every occurrence is counted and
    the ``word_num`` most frequent tokens are kept. The number of distinct
    tokens is printed as well.

    Parameters
    ---------------------------------------
    * df : Dataframe

    * word_var : Name of the variable containing the "bag of words"
      (one iterable of tokens per row)

    * word_num : The number of most common words/tokens to keep

    Returns
    ---------------------------------------
    A tuple ``(ranking, corpus_size)``: ``ranking`` is a dataframe indexed by
    token ("words") with the single column "no_words" (occurrences), sorted
    in descending order; ``corpus_size`` is the number of distinct tokens.
    """

    ## Pool the tokens of all documents into one flat list
    tokens = []
    for document in df[word_var]:
        tokens.extend(document)

    corpus_size = len(set(tokens))
    print("Number of unique words:", corpus_size)

    ## One row per token occurrence, each carrying a count of 1 ...
    occurrences = pd.DataFrame({
        "no_words": np.ones(len(tokens), dtype="int64"),
        "words": tokens,
    })

    ## ... which are summed per token and ordered from most to least frequent
    ranking = (
        occurrences
        .groupby(by="words")
        .sum()
        .sort_values(by="no_words", ascending=False)
    )

    return (ranking.iloc[0:word_num], corpus_size)


#%%

# Step 1: Clean the texts and rank the resulting tokens

## Turn the "Abstract" column into token lists stored in "cleaned_abstracts"
df = text_cleaner(
    df,
    textlist="Abstract",
    language="english",
    newvar="cleaned_abstracts",
)

## The 50 most frequent tokens and the number of distinct tokens overall
df_mostwords, no_unique_words = most_words(
    df,
    word_var="cleaned_abstracts",
    word_num=50,
)

#test = df[["Abstract","cleaned_abstracts"]]

#%%

# Step 2: Create a figure containing the 50 most prevalent/ most common tokens
# (a bar chart of df_mostwords, written to "50_words.png" in the output folder)
import seaborn as sbn
## Switch matplotlib to seaborn's default styling for the figure below
sbn.set()

## Bar heights are the token counts, bar positions simply 0..len-1 and the
## tick labels the tokens themselves (the index of df_mostwords)
y = df_mostwords["no_words"]
x = range(len(df_mostwords))
xlab = df_mostwords.index

## open figure canvas and create a barplot
plt.figure()
plt.bar(x,y)

## Set ticks, ticklabels and options for axis-labels
## (labels are rotated by 90 degrees so that the tokens do not overlap)
plt.xticks(ticks=x, labels=xlab, rotation=90)
plt.xlabel("Most Prevalent Tokens", fontsize=18, fontweight="bold")
plt.ylabel("Number of Appearences", fontsize=18, fontweight="bold")
plt.tight_layout()

## Save and close figure
plt.savefig(output + "50_words.png", dpi=1200)

plt.close()


#%%

# Step 3: Create a textcorpus and a bag of words
## One list of cleaned tokens per document
corpus = df["cleaned_abstracts"]
len(corpus)


## Create a bag of word.dictionary from the tokens and convert every
## document with doc2bow into the bag-of-words form used for the models
dictionary = gs.corpora.Dictionary(documents=corpus)
text_corpus = [dictionary.doc2bow(text) for text in corpus]

# Save pickle-data and gensim-dictionary for further investigation
# (both files are written to the current working directory; the "with"
# block makes sure the pickle file is flushed and closed)
with open('corpus_REF.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

## The bare len() expressions only matter for interactive inspection;
## they have no effect when the file is run as a script
size = len(corpus)
len(dictionary)
len(text_corpus)

#%%

# Step 4: exclude lines without abstract data
df = df.dropna(subset=["Abstract"])
## Renumber the rows 0..N-1: the per-topic columns assigned further below come
## from a dataframe with a fresh 0..N-1 index and are aligned on that index
df = df.reset_index(drop=True)


## Calculate topic-models, perplexity and coherence in a given topic-range
## in addition to an interactive html for further investigation of the topic space
topic_range = range(5,101)

## Create all export folders up front, so that no model is trained only to
## fail afterwards when its results are written
for subfolder in ("Topics", "Perplexities", "Coherence", "htmls",
                  "Topic_Coordinates", "Token_Tables"):
    os.makedirs(output + subfolder, exist_ok=True)

## gensim documents chunksize as an integer; use about a tenth of the
## documents per chunk (and never less than one)
chunk_size = max(1, len(df) // 10)

## NOTE(review): LdaMulticore starts worker processes. When this file is run
## as a plain script on Windows, module-level code like this loop should
## presumably sit behind `if __name__ == "__main__":` - confirm for the way
## the script is launched.

all_time_start = ti.time()
for n in topic_range:
    zero_in = ti.time()
    a = ti.time()
    print("starting calculation")
    NUM_TOPICS = n
    ldamodel = gs.models.ldamulticore.LdaMulticore(
        text_corpus,
        num_topics=NUM_TOPICS,
        id2word=dictionary,
        passes=15,
        iterations=300,
        chunksize=chunk_size,
        eval_every=5,
    )
    print("saving the model")
    ## The model file is written to the current working directory
    savename = "REF_sociology" + str(NUM_TOPICS) + str(len(text_corpus)) +  ".gensim"
    ldamodel.save(savename)
    print("end saving the model")

    ## Export the 25 top tokens of every topic (the console shows print_topic's
    ## default number of tokens per topic)
    print("exporting topics")
    topics2 = []
    for t in range(NUM_TOPICS):
        print(ldamodel.print_topic(t))
        topics2.append(ldamodel.print_topic(t, topn=25))
    topic_data = pd.DataFrame(data=topics2)
    topic_data.to_excel(output + "Topics\\" +"Topic_" + str(NUM_TOPICS) +".xlsx")
    b = ti.time()
    print("time taken for:",str(NUM_TOPICS),"-topics:",b-a)

    ## Reload the model that was just saved; perplexity and the html
    ## visualisation below are computed from this reloaded copy
    lda = gs.models.ldamodel.LdaModel.load(savename)

    ## Generate the topic distribution across all texts and store it in a
    ## separate Excel file per model
    time_start = ti.time()
    print("staring to convert corpus to bow-format and getting probabilities for text-topic-assignment")
    ## text_corpus already is the bag-of-words form of `corpus`, so it is
    ## reused instead of being rebuilt for every model
    topic_distribution_over_texts = []
    for counter, bow in enumerate(text_corpus):
        topic_distribution_over_texts.append(ldamodel.get_document_topics(bow))
        print("currently at text:", counter+1, df.iloc[counter])

    df["Topic_Distribution"] = topic_distribution_over_texts

    ## Spread the (topic id, probability) tuples of every text over one value
    ## per topic; topics that are not reported for a text stay 0
    topic_df_list = []
    for counter, row in enumerate(topic_distribution_over_texts):
        rows = [0] * n
        for tup in row:
            rows[tup[0]] = tup[1]
            print("currently at row", counter, "and tuple:", tup)
        topic_df_list.append(tuple(rows))
    ## Explicit column labels 0..n-1 guarantee one column per topic
    df_topics = pd.DataFrame(data=topic_df_list, columns=range(n))

    time_end = ti.time()
    print("assigning topics took:", time_end - time_start)

    print("saving")

    ## Assign correct topics: one column per topic, labelled 0..n-1. Columns
    ## of the previous (smaller) model are overwritten because the topic
    ## range only grows.
    for x in df_topics.keys():
        df[x] = df_topics[x]
        print(x)

    df.to_excel(output + "Topics\\" + "dataframe_Topic_" + str(NUM_TOPICS) + ".xlsx")

    del df_topics

    ## Calculate perplexity values
    ## NOTE(review): gensim documents log_perplexity as returning the per-word
    ## likelihood bound (perplexity would be 2**(-bound)); the exported column
    ## is nevertheless named "Perplexity" - confirm this is intended.
    p = lda.log_perplexity(text_corpus)
    perplexity = [p]

    perplexity_df = pd.DataFrame(perplexity, columns=["Perplexity"])
    perplexity_df["no_topics"] = NUM_TOPICS
    perplexity_df.to_excel(output + "Perplexities\\" +"Topic_" + str(NUM_TOPICS) +"_perplexity.xlsx")

    ## Calculate log UMass coherence: one row holding the number of topics,
    ## the overall score and one score per topic
    coherences = [NUM_TOPICS]
    column_namelist = ["no_topics","u_mass"]
    for item in range(n):
        column_namelist.append("u_mass_topic" + str(item))

    # U_Mass-Coherence
    coherence_model_umass = cm(model=ldamodel, corpus=text_corpus, coherence='u_mass')
    coherence_umass = coherence_model_umass.get_coherence()

    # U_Mass-Coherence for each topic:
    topics_umass = coherence_model_umass.get_coherence_per_topic()

    coherences.append(coherence_umass)
    coherences.extend(topics_umass)
    print("\nfor Topic",NUM_TOPICS," umass coherence scores:",coherence_umass)
    coherence_df = pd.DataFrame(index=[NUM_TOPICS], columns=column_namelist)
    coherence_df.iloc[0] = coherences
    coherence_df.to_excel(output + "Coherence\\" + "Topic_" + str(NUM_TOPICS) + ".xlsx")

    print("\n\n Perplexity for model with", str(NUM_TOPICS),"number of topics is:",p)
    print("\n\n Generating HTML")
    ## Create a topic space (per the original note based on multidimensional
    ## scaling). Warning! Takes very long. Comment out this call and the
    ## exports below if you don't need the topic space.
    lda_display = pyLDAvis.gensim.prepare(lda, text_corpus, dictionary)
    pyLDAvis.save_html(lda_display, output + "htmls\\"+ "Topic_" + str(NUM_TOPICS) + ".html")

    print("saving topic_coordinates and token table")
    topic_coordinates = lda_display.topic_coordinates
    topic_coordinates.to_excel(output + "Topic_Coordinates\\" + "Topic_coordinates_" + str(NUM_TOPICS) + ".xlsx")
    token_table = lda_display.token_table
    token_table.to_excel(output + "Token_Tables\\" + "Topic_coordinates_" + str(NUM_TOPICS) + ".xlsx")

    zero_out = ti.time()

    print("calculating lda with", NUM_TOPICS,"topics took", (zero_out-zero_in)/60, "minutes")

    #pyLDAvis.show(lda_display)

all_time_end = ti.time()
print("total time taken in minutes:", (all_time_end-all_time_start)/60)







    
    
